# Load ZTrax Transaction data and clean following Nolte et al
# For our study, we are going to follow the recommendations for "high confidence" only.
# Note: This can only be run on Judd's computer.
# "PBNOV13" - Patrick edited the line(s) below Nov 13 (to create flags for potentially bad transactions, rather than filtering)
#

pacman::p_load(tidyverse, tidylog, data.table, readxl, maps, stringi, pbmcapply, fst)

rm(list = ls())

source("code/globals.R")



# Clean CA transactions ----

ZTrans_dir <- file.path(WORKING, "ztraxdata/ZTrans")

# Load transactions data
trans <- read_fst(file.path(ZTrans_dir, "trans.fst"), 
                  as.data.table = T)

trans <- trans %>%
  mutate(FIPS = sprintf("%05d", FIPS),
         state_fips = substr(FIPS, 1, 2))
                   

# Load transaction property data to get ImportParcelID (which would allow us to link in other property characteristics)
propTrans <- read_fst(file.path(ZTrans_dir, "propTrans.fst"), as.data.table = T)

trans <- trans %>%
  left_join(propTrans, by = "TransId")

# Create a state variable (only CA) for Nolte filtering
trans <- trans %>% mutate(state = "CA")

# APPLY NOLTE ET AL FILTERS ----

## Flag if not single family residences, condos, coops, or similar ----
trans <- trans %>%
  mutate(bad_assessment_use = !(AssessmentLandUseStndCode %in% c(
    "RR101", # SFR
    "RR999", # Inferred SFR
    # 'RR102',  # Rural Residence   (includes farm/productive land?)
    "RR104", # Townhouse
    "RR105", # Cluster Home
    "RR106", # Condominium
    "RR107", # Cooperative
    "RR108", # Row House
    "RR109", # Planned Unit Development
    "RR113", # Bungalow
    "RR116", # Patio Home
    "RR119", # Garden Home
    "RR120"
  ))) # Landominium

## Flag events that are not deed transfers ----
# Excludes mortgage records, foreclosures, etc. See documentation.
trans <- trans %>% mutate(bad_dataclass = !(DataClassStndCode %in% c("D", "H")))
# Does not flag any records in the sample data, might in the full data



## Filter on DocumentTypeStndCode ----
# Note: There is some language in here built to do this on a state-specific basis. We are only doing this for CA so much of it is not going to make a difference.

# Check
x <- trans %>%
  count(DocumentTypeStndCode, state) %>%
  arrange(-n)

# For each state, load a list of acceptable codes from Nolte et al
doctype_codes <- read_xlsx(file.path(INPUT_PUBLIC, "NolteEtAl", "LE-100-1-Nolte_appB.xlsx"),
  sheet = "DocumentTypeStndCode"
)

# For a few codes, quality is the same for every state
doctype_codes_all <- doctype_codes %>%
  filter(group != "by state") %>%
  select(DocumentTypeStndCode = code, doctype_quality_all = group)

# For the rest, quality is state-specific
doctype_codes_state <- doctype_codes %>%
  pivot_longer(cols = AL:WV) %>%
  filter(group == "by state") %>% # Keep only high quality codes
  select(DocumentTypeStndCode = code, state = name, doctype_quality_bystate = value)

# A handful of states are not represented in the document provided by Nolte et al
# For now, we fill in those state-document type ratings with the model rating
all_codes_all_states <- expand_grid(
  DocumentTypeStndCode = unique(doctype_codes_state$DocumentTypeStndCode),
  state = state.fips$abb
)

doctype_codes_state <- doctype_codes_state %>%
  right_join(all_codes_all_states, by = c("DocumentTypeStndCode", "state")) %>%
  group_by(DocumentTypeStndCode) %>%
  mutate(doctype_quality_bystate = case_when(
    !is.na(doctype_quality_bystate) ~ doctype_quality_bystate,
    TRUE ~ Mode(doctype_quality_bystate)
  )) %>%
  ungroup()

# Join both to transaction data, and keep if either the all or the by state code is high
trans <- trans %>%
  left_join(doctype_codes_all, by = c("DocumentTypeStndCode")) %>%
  left_join(doctype_codes_state, by = c("DocumentTypeStndCode", "state"))

# Flag properties without high quality document type
trans <- trans %>% 
  mutate(bad_doctype = (!is.na(doctype_quality_all) & doctype_quality_all != "high") |
  (!is.na(doctype_quality_bystate) & doctype_quality_bystate != "high"))

## Filter on LoanAmountStndCode -----

# Load codes from Nolte et al
loantype_codes <- read_xlsx(file.path(INPUT_PUBLIC, "NolteEtAl", "LE-100-1-Nolte_appB.xlsx"),
  sheet = "LoanTypeStndCode"
) %>%
  select(LoanTypeStndCode = code, loantype_quality = group)

# Flag if there is a non-empty, low quality loan type code
trans <- trans %>% left_join(loantype_codes)
trans <- trans %>% mutate(bad_loantype = !is.na(LoanTypeStndCode) & LoanTypeStndCode != "" & loantype_quality != "high")

## Flag if PartialInterestTransferStndCode not empty ----
trans <- trans %>% mutate(bad_partialinterest = !is.na(PartialInterestTransferStndCode) & PartialInterestTransferStndCode != "")

## Flag bad SalesPriceAmountStndCode ----
# Load codes from Nolte et al
salesprice_codes <- read_xlsx(file.path(INPUT_PUBLIC, "NolteEtAl", "LE-100-1-Nolte_appB.xlsx"),
  sheet = "SalesPriceAmountStndCode"
) %>%
  select(SalesPriceAmountStndCode = code, salesprice_quality = group)

# Keep if there is no sale price amount type code, it's empty, or if it's high quality
trans <- trans %>% left_join(salesprice_codes)
trans <- trans %>% mutate(bad_pricecode = !is.na(SalesPriceAmountStndCode) & SalesPriceAmountStndCode != "" & salesprice_quality != "high")

## Flag sale prices < 1001 ----
trans <- trans %>% mutate(bad_priceamt = !is.na(SalesPriceAmount) & SalesPriceAmount < 1001)

## Flag if intra-family transfer flag is Y ----

# This is Zillow's own flag for buyer/seller similarity; its construction is not publicly documented, to the best of our knowledge
trans <- trans %>% mutate(bad_ztrax_family = IntraFamilyTransferFlag == "Y")


write_fst(trans, file.path(WORKING, "trans_clean.fst"))
